# Fix the RNG seed so all stochastic steps in this document are reproducible.
set.seed(12345)
# Load the shared ingestion script — presumably provides `data`, `cachefile()`
# and the ITERATIONS/CHAINS/CORES/THREADS constants used below (defined there,
# not visible here).
source("ingest_data.R")
# Base name under which this document's model fits are cached on disk.
MODEL_CACHE <- "added-M_pop_A_team_team_repo_RDC_zi_sd025_gamma05"

Settings

Models are stored in the following directory, which must exist prior to knitting this document:

# Print the absolute path of the model cache directory (must exist before
# knitting). file.path() replaces the manual paste(sep = "/"), and TRUE is
# spelled out (T is a reassignable variable, not a reserved word).
cat(normalizePath(file.path(getwd(), dirname(cachefile(MODEL_CACHE))), mustWork = TRUE))
## /home/app/.cache

The used cache directory can be controlled via the cache parameter to Rmd - it can be useful to experiment with this parameter if you Knit the document manually in RStudio.

Full model, with team- and team/repo- variation for removed lines, existing complexity and existing number of clones

Compared to the previous model, we now add two more predictors, existing complexity and existing number of clones in the file, prior to the change.

Both of these additional predictors have the same structure as the previous added and removed lines - they are both zero-bounded, and highly right-skewed. Therefore, we log-transform them, and center the resulting logarithm, so the value used in the model is zero-centered with standard deviation 1.

This means that we can use the same priors as we did for the prior model, \(\mathcal{M}_1\). The only difference is that now our model incorporates many more predictors.

# Modelling data set: outcome y (introduced clones), the four centred
# log-scale predictors A/C/D/R, and the two grouping factors.
d <- data |>
  select(y = INTROD, A, C, D, R, team = committerteam, repo)

# The count part and the zero-inflation (zi) part share the same linear
# predictor structure, with varying intercepts and varying R/C/D slopes at
# the team and team-within-repo levels.
formula <- bf(
  y ~ 1 + A + R + C + D + (1 + R + C + D | team) + (1 + R + C + D | team:repo),
  zi ~ 1 + A + R + C + D + (1 + R + C + D | team) + (1 + R + C + D | team:repo)
)

# Inspect the default priors brms would assign for this formula and family.
get_prior(formula = formula,
          data = d,
          family = zero_inflated_negbinomial)
##                    prior     class      coef     group resp dpar nlpar lb ub
##                   (flat)         b                                          
##                   (flat)         b         A                                
##                   (flat)         b         C                                
##                   (flat)         b         D                                
##                   (flat)         b         R                                
##                   lkj(1)       cor                                          
##                   lkj(1)       cor                team                      
##                   lkj(1)       cor           team:repo                      
##  student_t(3, -2.3, 2.5) Intercept                                          
##     student_t(3, 0, 2.5)        sd                                      0   
##     student_t(3, 0, 2.5)        sd                team                  0   
##     student_t(3, 0, 2.5)        sd         C      team                  0   
##     student_t(3, 0, 2.5)        sd         D      team                  0   
##     student_t(3, 0, 2.5)        sd Intercept      team                  0   
##     student_t(3, 0, 2.5)        sd         R      team                  0   
##     student_t(3, 0, 2.5)        sd           team:repo                  0   
##     student_t(3, 0, 2.5)        sd         C team:repo                  0   
##     student_t(3, 0, 2.5)        sd         D team:repo                  0   
##     student_t(3, 0, 2.5)        sd Intercept team:repo                  0   
##     student_t(3, 0, 2.5)        sd         R team:repo                  0   
##        gamma(0.01, 0.01)     shape                                      0   
##                   (flat)         b                            zi            
##                   (flat)         b         A                  zi            
##                   (flat)         b         C                  zi            
##                   (flat)         b         D                  zi            
##                   (flat)         b         R                  zi            
##           logistic(0, 1) Intercept                            zi            
##     student_t(3, 0, 2.5)        sd                            zi        0   
##     student_t(3, 0, 2.5)        sd                team        zi        0   
##     student_t(3, 0, 2.5)        sd         C      team        zi        0   
##     student_t(3, 0, 2.5)        sd         D      team        zi        0   
##     student_t(3, 0, 2.5)        sd Intercept      team        zi        0   
##     student_t(3, 0, 2.5)        sd         R      team        zi        0   
##     student_t(3, 0, 2.5)        sd           team:repo        zi        0   
##     student_t(3, 0, 2.5)        sd         C team:repo        zi        0   
##     student_t(3, 0, 2.5)        sd         D team:repo        zi        0   
##     student_t(3, 0, 2.5)        sd Intercept team:repo        zi        0   
##     student_t(3, 0, 2.5)        sd         R team:repo        zi        0   
##        source
##       default
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##       default
##  (vectorized)
##  (vectorized)
##       default
##       default
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##       default
##       default
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##       default
##       default
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
# Weakly-informative, regularizing priors with the same structure as M_1:
# - normal(0, 0.5) intercepts and normal(0, 0.25) slopes (log / logit scale),
# - weibull(2, 0.25) on all group-level standard deviations (zero-bounded,
#   concentrating mass on small group-level variation),
# - lkj(2) mildly favouring small correlations,
# - gamma(0.5, 0.1) on the negative-binomial shape parameter.
# The team:repo sd priors are listed explicitly even though they match the
# general sd prior, making the intent visible in validate_prior() output.
priors <- c(prior(normal(0, 0.5), class = Intercept),
            prior(normal(0, 0.25), class = b),
            prior(weibull(2, 0.25), class = sd),
            prior(weibull(2, 0.25), class = sd, group=team:repo),
            prior(lkj(2), class = cor),
            prior(normal(0, 0.5), class = Intercept, dpar=zi),
            prior(normal(0, 0.25), class = b, dpar=zi),
            prior(weibull(2, 0.25), class = sd, dpar=zi),
            prior(weibull(2, 0.25), class = sd, group=team:repo, dpar=zi),
            prior(gamma(0.5, 0.1), class = shape))

# Cross-check the user priors against the model's parameter table; printing v
# shows which priors are taken verbatim ("user") and which are vectorized.
v <- validate_prior(
  prior = priors,
  formula = formula,
  data = d,
  family = zero_inflated_negbinomial
)
v
##                 prior     class      coef     group resp dpar nlpar lb ub
##       normal(0, 0.25)         b                                          
##       normal(0, 0.25)         b         A                                
##       normal(0, 0.25)         b         C                                
##       normal(0, 0.25)         b         D                                
##       normal(0, 0.25)         b         R                                
##       normal(0, 0.25)         b                            zi            
##       normal(0, 0.25)         b         A                  zi            
##       normal(0, 0.25)         b         C                  zi            
##       normal(0, 0.25)         b         D                  zi            
##       normal(0, 0.25)         b         R                  zi            
##        normal(0, 0.5) Intercept                                          
##        normal(0, 0.5) Intercept                            zi            
##  lkj_corr_cholesky(2)         L                                          
##  lkj_corr_cholesky(2)         L                team                      
##  lkj_corr_cholesky(2)         L           team:repo                      
##      weibull(2, 0.25)        sd                                      0   
##      weibull(2, 0.25)        sd                            zi        0   
##      weibull(2, 0.25)        sd                team                  0   
##      weibull(2, 0.25)        sd         C      team                  0   
##      weibull(2, 0.25)        sd         D      team                  0   
##      weibull(2, 0.25)        sd Intercept      team                  0   
##      weibull(2, 0.25)        sd         R      team                  0   
##      weibull(2, 0.25)        sd                team        zi        0   
##      weibull(2, 0.25)        sd         C      team        zi        0   
##      weibull(2, 0.25)        sd         D      team        zi        0   
##      weibull(2, 0.25)        sd Intercept      team        zi        0   
##      weibull(2, 0.25)        sd         R      team        zi        0   
##      weibull(2, 0.25)        sd           team:repo                  0   
##      weibull(2, 0.25)        sd         C team:repo                  0   
##      weibull(2, 0.25)        sd         D team:repo                  0   
##      weibull(2, 0.25)        sd Intercept team:repo                  0   
##      weibull(2, 0.25)        sd         R team:repo                  0   
##      weibull(2, 0.25)        sd           team:repo        zi        0   
##      weibull(2, 0.25)        sd         C team:repo        zi        0   
##      weibull(2, 0.25)        sd         D team:repo        zi        0   
##      weibull(2, 0.25)        sd Intercept team:repo        zi        0   
##      weibull(2, 0.25)        sd         R team:repo        zi        0   
##       gamma(0.5, 0.1)     shape                                      0   
##        source
##          user
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##          user
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##          user
##          user
##          user
##  (vectorized)
##  (vectorized)
##          user
##          user
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##          user
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##          user
##  (vectorized)
##  (vectorized)
##  (vectorized)
##  (vectorized)
##          user

The same structure of priors as in \(\mathcal{M}_1\) is used here—the only difference is that we have added more numerical predictors, scaled and centered in the same way as the existing ones.

Prior Predictive Checks

# Prior-predictive fit: sample_prior = "only" ignores the likelihood, so the
# draws reflect the priors alone. The fit is cached under a "ppc-" prefix and
# only refit when the model specification changes (file_refit = "on_change").
M_ppc <- brm(data = d,
      family = zero_inflated_negbinomial,
      formula = formula,
      prior = priors,
      file = cachefile(paste0("ppc-",MODEL_CACHE)),
      warmup = 1000,
      iter  = ITERATIONS,
      chains = CHAINS,
      cores = CORES,
      sample_prior = "only",
      backend="cmdstanr",
      file_refit = "on_change",
      threads = threading(THREADS),
      save_pars = SAVE_PARS,
      adapt_delta = ADAPT_DELTA)
m <- M_ppc
# Simulate outcomes from the prior-only model at the observed predictor values.
yrep <- posterior_predict(m, newdata = d) # or nd

Number of zeros

# Prior predictive check: proportion of zero counts vs. the observed proportion.
ppc_stat(y = d$y, yrep, stat = function(y) mean(y == 0)) + ggtitle("Prior predicted proportion of zeros")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The general pattern that we have seen in both \(\mathcal{M}_0\) and \(\mathcal{M}_1\) remains. Our model expects between \(40\%\), up to \(100\%\) zeros, with the peak around \(75\%\), and lower values at the extremes (especially low below \(50\%\)).

Max predicted value

# Prior predictive distribution of the sample maximum; stored for re-use below.
sim_max <- ppc_stat(y = d$y, yrep, stat = "max") +
  ggtitle("Prior predicted max values")
sim_max
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Relative to the prior models, the maximum value is now more wild—ranging into the tens of thousands, which is clearly unrealistic.

If we scale the plot to more reasonable values, we see that there is higher probability of 500 to 1000 introduced clones than in \(\mathcal{M}_1\), and the most likely value is still below the observed maximum. So the observed value still fits well within the range of predictions.

# Zoom in on the 0-1000 range, where the bulk of the distribution lies.
sim_max + scale_x_continuous(limits = c(0,1000)) + ggtitle("Prior predicted max values up to 1000")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

99th percentile

Relative to \(\mathcal{M}_1\), our new model Q99 predictions range wider (up to 200, where \(\mathcal{M}_1\) had 100). But the general pattern fits the observations, and we know that, because we have more predictors, our model will swing more in its predictions.

# Prior predictive 99th percentile vs. the observed value.
ppc_stat(y = d$y, yrep, stat = function(y) quantile(y, 0.99)) + ggtitle("Prior predicted Q99 vs. observed value")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

99th vs 95th percentile

The same pattern repeats for the Q99-vs-Q95 plot. Somewhat wider predicted values, but still encompassing the observations.

# Joint prior predictive check of the 95th and 99th percentiles.
p <- ppc_stat_2d(d$y, yrep, stat = c("q95", "q99")) +
  theme(legend.position = "bottom") #+ ggtitle("Prior predicted Q95 vs Q99")
p

Standard deviation

Instead of ranging up to 1200, the standard deviation now ranges to about 3000. We need to scale the \(x\)-axis to see the real pattern.

# Prior predictive standard deviation; stored so it can be re-scaled below.
p <- ppc_stat(y = d$y, yrep, stat = "sd") +
  ggtitle("Prior predicted stddev vs. observed value")
p
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Relative to \(\mathcal{M}_1\), this model has a wider standard deviation range, but it still encompasses the observed value well.

# Zoom in: the observed standard deviation sits well inside the predictions.
p + scale_x_continuous(limits=c(0,30))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Group-level predictions

The group-level data is also wider than the prior model. The Brown team now ranges (based only on prior knowledge) up to 15000, and the Red team up to 4000.

# Prior predictive Q99, broken down per team.
ppc_stat_grouped(y = d$y, yrep, stat = "q99", group = d$team) + theme(legend.position = "bottom") + ggtitle("Prior predictive Q99 observation per team")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Also this model will show Integration tests to be a bit of an anomaly, with predictions (based only on prior knowledge) ranging up to 9000. Because we have not yet trained our model, these predictions indicate that something is different (e.g. more added or less removed lines, different complexity) in this repository, compared to the others.

# Prior predictive Q99, broken down per repository.
ppc_stat_grouped(y = d$y, yrep, stat = "q99", group = d$repo) + theme(legend.position = "bottom") + ggtitle("Prior predictive Q99 observation per repository")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Model execution and diagnostics

# Fit the full model on the real likelihood (no sample_prior). The fit is
# cached on disk and only refit when the model specification changes.
M_pop_A_team_team_repo_zi_sd025_model <-
  brm(data = d,
      family = zero_inflated_negbinomial,
      file = cachefile(MODEL_CACHE),
      formula = formula,
      prior = priors,
      warmup = 1000,
      iter  = ITERATIONS,
      chains = CHAINS,
      cores = CORES,
      backend="cmdstanr",
      file_refit = "on_change",
      threads = threading(THREADS),
      save_pars = SAVE_PARS,
      adapt_delta = ADAPT_DELTA)
# Attach the PSIS-LOO criterion (stored alongside the cached fit).
M_pop_A_team_team_repo_zi_sd025_model <- add_criterion(M_pop_A_team_team_repo_zi_sd025_model, "loo")
m <- M_pop_A_team_team_repo_zi_sd025_model
# The model has far too many parameters for one readable trace panel, so plot
# them twelve at a time. The full trace plot is built once only to obtain the
# canonical parameter ordering from its data.
p <- mcmc_trace(m)
pars <- levels(p[["data"]][["parameter"]])
plots <- seq(1, to=length(pars), by=12)
lapply(plots, function(start) {
  # Clamp the page end to the number of parameters instead of indexing past
  # the end of `pars` and discarding the resulting NAs with na.omit().
  end <- min(start + 11, length(pars))
  mcmc_trace(m, pars = pars[start:end])
  })
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

## 
## [[25]]

## 
## [[26]]

## 
## [[27]]

## 
## [[28]]

## 
## [[29]]

## 
## [[30]]

## 
## [[31]]

## 
## [[32]]

## 
## [[33]]

## 
## [[34]]

## 
## [[35]]

## 
## [[36]]

## 
## [[37]]

## 
## [[38]]

## 
## [[39]]

## 
## [[40]]

## 
## [[41]]

## 
## [[42]]

## 
## [[43]]

## 
## [[44]]

## 
## [[45]]

## 
## [[46]]

## 
## [[47]]

## 
## [[48]]

## 
## [[49]]

## 
## [[50]]

## 
## [[51]]

## 
## [[52]]

## 
## [[53]]

## 
## [[54]]

## 
## [[55]]

## 
## [[56]]

## 
## [[57]]

## 
## [[58]]

## 
## [[59]]

## 
## [[60]]

## 
## [[61]]

## 
## [[62]]

## 
## [[63]]

## 
## [[64]]

## 
## [[65]]

## 
## [[66]]

## 
## [[67]]

## 
## [[68]]

The “caterpillar plots” show that the chains mixed well also for this model.

# R-hat per parameter: values near 1 indicate converged chains.
mcmc_plot(m, type="rhat")

# Histogram view of the same R-hat values.
mcmc_plot(m, type="rhat_hist")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Ratio of effective sample size to total draws, per parameter.
mcmc_plot(m, type="neff")

# Histogram view of the effective-sample-size ratios.
mcmc_plot(m, type="neff_hist")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The MCMC chains, \(\hat{R}\), and the \(N_{eff}\) ratio all look good.

# Retrieve and print the PSIS-LOO estimate (computed above via add_criterion,
# so this does not recompute the log-likelihood matrix from scratch).
(loo <- loo(m))
## 
## Computed from 12000 by 31007 log-likelihood matrix.
## 
##          Estimate    SE
## elpd_loo  -8487.8 163.2
## p_loo       259.6  16.8
## looic     16975.5 326.4
## ------
## MCSE of elpd_loo is NA.
## MCSE and ESS estimates assume MCMC draws (r_eff in [0.2, 1.5]).
## 
## Pareto k diagnostic values:
##                          Count Pct.    Min. ESS
## (-Inf, 0.7]   (good)     30995 100.0%  192     
##    (0.7, 1]   (bad)          9   0.0%  <NA>    
##    (1, Inf)   (very bad)     3   0.0%  <NA>    
## See help('pareto-k-diagnostic') for details.
# Pareto-k diagnostic plot; observations above 0.7 must be refit via reloo.
plot(loo)

This, more complex, model also has more outliers that need to be analyzed with reloo. We now have 12 data points with PSIS-estimated Pareto-k value greater than the 0.7 threshold.

Reloo is disabled by default, but can be enabled by setting the RELOO environment variable (which in turn will set the reloo parameter of this document).

# reloo refits the model once per high-Pareto-k observation, which takes more
# than a day here, so the result is cached on disk.
reloofile <- cachefile(paste0("reloo-", MODEL_CACHE, ".rds"))
if (file.exists(reloofile)) {
    reloo <- readRDS(reloofile)
} else {
    # Auto-printing does not happen inside if-branches, so bare Sys.time()
    # calls (as in the original) would be silently discarded; use message().
    message("reloo started at ", Sys.time())
    reloo <- reloo(m, loo, chains=CHAINS, cores=CORES)
    message("reloo finished at ", Sys.time())
    saveRDS(reloo, reloofile)
}
# Print the recomputed LOO in both the cached and the freshly-computed case.
reloo
## 
## Computed from 12000 by 31007 log-likelihood matrix.
## 
##          Estimate    SE
## elpd_loo  -8492.8 163.8
## p_loo       264.6  18.7
## looic     16985.5 327.5
## ------
## MCSE of elpd_loo is 0.8.
## MCSE and ESS estimates assume MCMC draws (r_eff in [0.2, 1.5]).
## 
## All Pareto k estimates are good (k < 0.7).
## See help('pareto-k-diagnostic') for details.

Note that the reloo results are cached, and included in the docker image by default. If you want to reproduce the reloo manually, set a separate cache dir (and mount it into the docker image), and enable the RELOO environment variable. It will take more than one day to complete all reloo, so it is best done over a weekend.

# Plot the recalculated Pareto-k values after reloo.
plot(reloo)

# Which observations still have pareto_k above 0.5 (the pareto_k_ids default)?
influencers <- data[loo::pareto_k_ids(reloo), ]
influencers |>
  group_by(repo, committerteam) |>
  tally()
## # A tibble: 0 × 3
## # Groups:   repo [0]
## # ℹ 3 variables: repo <fct>, committerteam <fct>, n <int>

According to the reloo function, all Pareto-k values are below 0.7, so we should be able to trust the model inferences.

Posterior predictive checks

# Posterior predictive draws at the observed predictor values (replaces the
# prior-only yrep from above).
yrep <- posterior_predict(m)

Posterior proportion of zeros

# Posterior predictive proportion of zeros vs. the observed proportion.
ppc_stat(y = d$y, yrep, stat = function(y) mean(y == 0))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The distribution of zeros is marginally higher than the observed value, but it still encompasses the observed proportion.

Posterior max value

# Posterior predictive distribution of the sample maximum; reused below.
sim_max <- ppc_stat(y = d$y, yrep, stat = "max")
sim_max
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Our more complex model is similar to model \(\mathcal{M}_1\), but expects a slightly wider range of values, up to 7500 duplicates (\(\mathcal{M}_1\) had up to 6000). But the most likely value is still close to the observed max value (150).

# Zoom in on the 0-1000 range of the posterior predicted maxima.
sim_max + scale_x_continuous(limits = c(0,1000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

When scaling to \(0--1000\), we see that the most likely prediction of our model is actually somewhat less than 150 (observed max value). But then the predictions drop off towards 750, or even 1000.

Posterior standard deviation

# Posterior predictive standard deviation vs. the observed value.
ppc_stat(y = d$y, yrep, stat = "sd")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Same principle as for the other metrics—the standard deviation is slightly wider than for model \(\mathcal{M}_1\).

Posterior Q99

# Posterior predictive 99th percentile vs. the observed value.
ppc_stat(y = d$y, yrep, stat = function(y) quantile(y, 0.99)) + ggtitle("Posterior predicted Q99")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Posterior Q95 vs Q99

# Joint posterior predictive check of the 95th and 99th percentiles.
ppc_stat_2d(d$y, yrep, stat = c("q95", "q99")) + ggtitle("Posterior predicted Q95 vs Q99")

The 95th and 99th quantiles are spot on the observed values of 1 and 5, respectively. Even smaller variations than \(\mathcal{M}_1\) are expected for Q99 (the histograms for 4 and 6 are smaller, and more evenly balanced in this model).

Posterior grouped predictions

# Posterior predictive maximum, broken down per team.
ppc_stat_grouped(y = d$y, yrep, stat = "max", group = d$team) + ggtitle("Posterior predictive max observation per team")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The expected max value per team varies a lot between teams. For some teams (e.g. Arch, Red, Violet), the model is reasonably sure that it will fall between 150-400, while for other teams (Brown, Green, Orange), it would expect up to 8000.

# Posterior predictive maximum, broken down per repository.
ppc_stat_grouped(y = d$y, yrep, stat = "max", group = d$repo) + ggtitle("Posterior predictive max observation per repo")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Some repositories (rather, teams in some repositories, as our model does not consider repo-level variation separately), like Integration tests and Mars, expect up to 7500-8000 introduced clones. The others expect around 500-600, so an order of magnitude less.

# Posterior predictive Q99 per team. The title said "99% quartile"; Q99 is the
# 99th percentile, so the label is corrected.
ppc_stat_grouped(y = d$y, yrep, stat = "q99", group = d$team) + ggtitle("Posterior predictive 99th percentile per team")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The 99th percentile value predictions seem very well fitted. Predictions surround the observation nicely, and the UI team continues to be the most varying (given that they have not contributed much in the data set).

# Posterior predictive Q99 per repository, with the same label fix as the
# per-team plot ("quartile" -> "percentile").
ppc_stat_grouped(y = d$y, yrep, stat = "q99", group = d$repo) + ggtitle("Posterior predictive 99th percentile per repo")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Similar for the repository-level variation–the predictions surround the observed Q99 value nicely.

Rootogram

# Suspended rootogram: bars hang from the expected counts, so deviations from
# the x-axis show per-count over- or under-prediction.
rootogram <- pp_check(m, type = "rootogram", style="suspended")
rootogram
## Using all posterior draws for ppc type 'rootogram' by default.

We need to rescale the rootogram axes to see anything of value.

# NOTE(review): `topleft` appears unused in the visible code — possibly a
# leftover from an earlier annotation-arrow version; confirm before removing.
topleft=data.frame(x=c(0, 0),
                   xend=c(17, 17), 
                   y=c(35, -4),
                   yend=c(22, 17))
# Main panel (b): the rootogram zoomed to counts 0-50.
main.plot <- rootogram + scale_x_continuous(limits = c(0,50), breaks = seq(from=0, to=50, by=5)) + scale_y_continuous(limits = c(-4,35), breaks = c(-5, 0, 5, 10, 15, 20, 25, 30, 35)) + geom_text(x=1, y=35, label="b)") + theme(legend.position="bottom")
# Inset panel (a): the full 0-150 range, with a red rectangle marking the
# region shown in the main panel.
inset.plot <- rootogram + theme(legend.position = "none") + scale_x_continuous(limits = c(0,150))  + geom_text(x=20, y=150, label="a)") + geom_rect(xmin=-3, xmax=50, ymin=-4, ymax=35, color="red", fill=NA)
# Compose the zoomed plot with the overview inset (cowplot-style draw_plot).
p <- ggdraw() + draw_plot(main.plot) + draw_plot(inset.plot, x=0.25, y=.5, width = 0.5, height = 0.5)
## Warning: Removed 6611 rows containing missing values or values outside the scale range
## (`geom_tile()`).
## Warning: Removed 6610 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Warning: Removed 6511 rows containing missing values or values outside the scale range
## (`geom_tile()`).
## Warning: Removed 6509 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
p

We see that the suspended histogram bars "line up" around the \(x\)-axis, on both sides (like residuals usually do). This indicates that there is no systematic over- or under-prediction going on in our model, so we should be able to trust its predictions.

Posterior predictions

The posterior predictions in general work well, though the single outlier in Saturn is not picked up (as most other changes there yield very few duplicates). Although our priors do allow such outliers to shine through, the model relegates them as very unlikely (most max predictions are in the tens or twenties).

# Helpers for per-team/per-repo posterior prediction heatmaps (defined in the
# sourced file, not visible here).
source("predict_utils.R")
# Probability of introducing at least one duplicate at Q95 added/removed lines
# and mean existing complexity/clones.
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), complexity = mean(data$COMPLEX), duplicates = mean(data$DUP), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p

# Same probability, but at Q99 existing complexity/clones (a "bad file").
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), complexity = q99(data$COMPLEX), duplicates = q99(data$DUP), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p

# Same probability at the median existing complexity/clones.
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), complexity = median(data$COMPLEX), duplicates = median(data$DUP), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p

# Large change (Q99 added/removed) to a relatively clean file (Q25 complexity
# and existing clones).
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q99(data$ADD), removed=q99(data$DEL), complexity = quantile(data$COMPLEX, 0.25), duplicates = quantile(data$DUP, 0.25), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p

# Probability with complexity/duplicates left at the helper's defaults.
p <- heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), summary = function(x) { length(which(x>0))/length(x) }), "Probability of introduced duplicates", decimals=2)
p

# Q95 and Q99 of the predicted number of introduced duplicates.
heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), summary=function(x) q95(x)), "Quantile 95%", decimals=0)

heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), summary=function(x) q99(x)), "Quantile 99%", decimals=0)

# Predicted Q99 when the existing file is already bad (Q95 complexity/clones).
heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q95(data$ADD), removed=q95(data$DEL), complexity=q95(data$COMPLEX), duplicates = q95(data$DUP), summary=function(x) q99(x)), "Quantile 99%", decimals=0)

# Predicted Q75 for an extreme change to an extreme file (all at Q99).
heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=q99(data$ADD), removed=q99(data$DEL), complexity=q99(data$COMPLEX), duplicates = q99(data$DUP), summary=function(x) quantile(x, 0.75)), "Quantile 75%", decimals=0)

# Median prediction for a large addition with few removals, at mean
# complexity/clones.
heatmap_by_team_and_repo(posterior_predict_by_team_and_repo(m, added=quantile(data$ADD, 0.99), removed=quantile(data$DEL, 0.1), complexity=mean(data$COMPLEX), duplicates = mean(data$DUP), summary=function(x) median(x)), "Median a posteriori", decimals=0)

# Full posterior summary: group-level sds and correlations, population-level
# effects, and the negative-binomial shape parameter.
summary(m)
##  Family: zero_inflated_negbinomial 
##   Links: mu = log; shape = identity; zi = logit 
## Formula: y ~ 1 + A + R + C + D + (1 + R + C + D | team) + (1 + R + C + D | team:repo) 
##          zi ~ 1 + A + R + C + D + (1 + R + C + D | team) + (1 + R + C + D | team:repo)
##    Data: d (Number of observations: 31007) 
##   Draws: 4 chains, each with iter = 4000; warmup = 1000; thin = 1;
##          total post-warmup draws = 12000
## 
## Group-Level Effects: 
## ~team (Number of levels: 11) 
##                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## sd(Intercept)              0.25      0.11     0.06     0.49 1.00     3408
## sd(R)                      0.17      0.08     0.04     0.36 1.00     5339
## sd(C)                      0.21      0.09     0.05     0.41 1.00     5345
## sd(D)                      0.11      0.06     0.02     0.27 1.00     6498
## sd(zi_Intercept)           0.15      0.08     0.03     0.33 1.00     7160
## sd(zi_R)                   0.26      0.11     0.06     0.50 1.00     4323
## sd(zi_C)                   0.17      0.09     0.03     0.38 1.00     5740
## sd(zi_D)                   0.15      0.09     0.03     0.35 1.00     6821
## cor(Intercept,R)           0.14      0.36    -0.58     0.77 1.00     8427
## cor(Intercept,C)           0.06      0.36    -0.65     0.73 1.00     7264
## cor(R,C)                   0.11      0.36    -0.61     0.74 1.00     7527
## cor(Intercept,D)           0.14      0.38    -0.61     0.78 1.00    10810
## cor(R,D)                  -0.02      0.38    -0.72     0.70 1.00    10547
## cor(C,D)                   0.01      0.38    -0.71     0.71 1.00     9248
## cor(zi_Intercept,zi_R)     0.00      0.38    -0.71     0.71 1.00     4565
## cor(zi_Intercept,zi_C)    -0.11      0.38    -0.77     0.65 1.00     9848
## cor(zi_R,zi_C)             0.09      0.38    -0.65     0.76 1.00     9599
## cor(zi_Intercept,zi_D)     0.00      0.38    -0.70     0.72 1.00     9116
## cor(zi_R,zi_D)            -0.14      0.38    -0.79     0.63 1.00     9162
## cor(zi_C,zi_D)            -0.09      0.38    -0.76     0.66 1.00     8454
##                        Tail_ESS
## sd(Intercept)              4272
## sd(R)                      6096
## sd(C)                      5512
## sd(D)                      6839
## sd(zi_Intercept)           6955
## sd(zi_R)                   5679
## sd(zi_C)                   6057
## sd(zi_D)                   6914
## cor(Intercept,R)           9542
## cor(Intercept,C)           7527
## cor(R,C)                   8817
## cor(Intercept,D)           9075
## cor(R,D)                   9656
## cor(C,D)                   9466
## cor(zi_Intercept,zi_R)     7256
## cor(zi_Intercept,zi_C)     8522
## cor(zi_R,zi_C)             9515
## cor(zi_Intercept,zi_D)     9114
## cor(zi_R,zi_D)             9053
## cor(zi_C,zi_D)             9630
## 
## ~team:repo (Number of levels: 84) 
##                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## sd(Intercept)              0.47      0.07     0.33     0.62 1.00     4365
## sd(R)                      0.32      0.06     0.21     0.45 1.00     3309
## sd(C)                      0.34      0.09     0.18     0.52 1.00     2242
## sd(D)                      0.21      0.05     0.12     0.33 1.00     4147
## sd(zi_Intercept)           0.59      0.08     0.44     0.75 1.00     5222
## sd(zi_R)                   0.35      0.08     0.21     0.52 1.00     3673
## sd(zi_C)                   0.40      0.11     0.19     0.64 1.00     2234
## sd(zi_D)                   0.36      0.08     0.21     0.54 1.00     4242
## cor(Intercept,R)          -0.22      0.21    -0.62     0.20 1.00     2999
## cor(Intercept,C)          -0.15      0.23    -0.56     0.33 1.00     4142
## cor(R,C)                   0.25      0.24    -0.25     0.70 1.00     3103
## cor(Intercept,D)           0.01      0.25    -0.49     0.49 1.00     5110
## cor(R,D)                  -0.31      0.26    -0.76     0.25 1.00     5429
## cor(C,D)                  -0.28      0.29    -0.79     0.32 1.00     3359
## cor(zi_Intercept,zi_R)    -0.35      0.21    -0.73     0.07 1.00     4222
## cor(zi_Intercept,zi_C)    -0.47      0.20    -0.82    -0.03 1.00     4486
## cor(zi_R,zi_C)             0.18      0.29    -0.38     0.73 1.00     2621
## cor(zi_Intercept,zi_D)     0.07      0.22    -0.37     0.49 1.00     5593
## cor(zi_R,zi_D)             0.15      0.26    -0.36     0.64 1.00     4259
## cor(zi_C,zi_D)             0.02      0.30    -0.54     0.60 1.00     2180
##                        Tail_ESS
## sd(Intercept)              5681
## sd(R)                      6079
## sd(C)                      4929
## sd(D)                      7345
## sd(zi_Intercept)           7108
## sd(zi_R)                   6488
## sd(zi_C)                   3582
## sd(zi_D)                   6576
## cor(Intercept,R)           5823
## cor(Intercept,C)           6628
## cor(R,C)                   4437
## cor(Intercept,D)           7430
## cor(R,D)                   7367
## cor(C,D)                   5783
## cor(zi_Intercept,zi_R)     5954
## cor(zi_Intercept,zi_C)     7074
## cor(zi_R,zi_C)             4575
## cor(zi_Intercept,zi_D)     8124
## cor(zi_R,zi_D)             6949
## cor(zi_C,zi_D)             5469
## 
## Population-Level Effects: 
##              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept       -1.51      0.15    -1.80    -1.21 1.00     4476     6927
## zi_Intercept     1.90      0.16     1.58     2.20 1.00     4608     5601
## A                1.07      0.05     0.98     1.16 1.00     6999     8776
## R                0.09      0.09    -0.09     0.28 1.00     5405     7117
## C                0.30      0.12     0.05     0.53 1.00     4636     7122
## D                0.29      0.08     0.12     0.45 1.00     7159     8364
## zi_A            -1.16      0.06    -1.27    -1.05 1.00     8230     8802
## zi_R             0.43      0.12     0.16     0.65 1.00     5160     7483
## zi_C             0.33      0.13     0.06     0.59 1.00     4471     6632
## zi_D            -0.34      0.10    -0.54    -0.13 1.00     6227     8337
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## shape     1.04      0.09     0.88     1.22 1.00     8533     8853
## 
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
source("conditional_effects.R")
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Blue", "Jupiter", added=q95(data$ADD), removed=q95(data$DEL)), "Blue", "Jupiter")

(red_jupiter_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Jupiter", added=q95(data$ADD), removed=q95(data$DEL)), "Red", "Jupiter") + labs(title=NULL, subtitle=NULL, legend=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10)) + theme_bw() +  scale_y_continuous(limits=c(0,50))
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

(red_uranus_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Uranus", added=q95(data$ADD), removed=q95(data$DEL)), "Red", "Uranus") + labs(title=NULL, subtitle=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10)) + theme_bw() + ylab(NULL) + scale_y_continuous(limits=c(0,50))
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

(red_plot <- ggarrange(red_jupiter_plot, red_uranus_plot, common.legend = T, legend="bottom"))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

figsave("red_jupiter_uranus.pdf", red_plot)
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Green", "Jupiter", added=q99(data$ADD), removed=q95(data$DEL)), "Green", "Jupiter")

(blue_jupiter_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Blue", "Jupiter", added=q99(data$ADD), removed=q95(data$DEL)), "Blue", "Jupiter") + labs(title=NULL, subtitle=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10)) + scale_y_continuous(limits=c(0,260)) + theme(legend.position = "none") + theme_bw()
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

(blue_uranus_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Blue", "Uranus", added=q99(data$ADD), removed=q95(data$DEL)), "Blue", "Uranus") + labs(title=NULL, subtitle=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10)) + scale_y_continuous(limits=c(0,260)) + ylab(NULL) + theme_bw()
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

(blue_plot <- ggarrange(blue_jupiter_plot, blue_uranus_plot, common.legend = T, legend="bottom"))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

figsave("blue_jupiter_uranus.pdf", blue_plot)
(complexity_plot <- ggarrange(plotlist = list(blue_jupiter_plot, blue_uranus_plot, red_jupiter_plot, red_uranus_plot), labels="AUTO", nrow=2, ncol=2, common.legend = T, legend="bottom") 
 ) 
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
## Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

figsave("blue_red_complexity.pdf", complexity_plot)
plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Uranus", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Uranus") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

(red_venus_plot <- plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Venus", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Venus") + labs(title=NULL, subtitle=NULL) + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
)
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Neptune", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Neptune") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Saturn", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Saturn") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "IntTest", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "IntTest") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Mars", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Mars") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

plot_logCOMPLEX_by_logDUP(m, d, condeffect_logCOMPLEX_by_logDUP(m, d, "Red", "Mercury", added=q99(data$ADD), removed=q95(data$DEL)), "Red", "Mercury") + scale_x_continuous(limits=c(0,max(data$COMPLEX)+10))
## Warning: Removed 492 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

data |> filter(committerteam=="Blue", repo=="Uranus") |> group_by(INTROD) |> tally() |> arrange(desc(n))
## # A tibble: 7 × 2
##   INTROD     n
##    <int> <int>
## 1      0   968
## 2      1    44
## 3      2    23
## 4      3     5
## 5      4     4
## 6      5     2
## 7     17     1
data |> filter(committerteam=="Red", repo=="Jupiter") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 13 × 2
##    INTROD     n
##     <int> <int>
##  1      0  2080
##  2      1    35
##  3      2    18
##  4      3     2
##  5      4     9
##  6      5     4
##  7      6     3
##  8      7     8
##  9      8     2
## 10      9     2
## 11     19     1
## 12     22     1
## 13     23     1
data |> filter(committerteam=="Red", repo=="Venus") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 6 × 2
##   INTROD     n
##    <int> <int>
## 1      0   214
## 2      1     5
## 3      2     7
## 4      5     2
## 5      6     1
## 6     11     2
data |> filter(committerteam=="Red", repo=="Uranus") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 5 × 2
##   INTROD     n
##    <int> <int>
## 1      0   348
## 2      1    19
## 3      2    11
## 4      3     7
## 5      4     1
data |> filter(committerteam=="Red", repo=="Venus") |> tally()
##     n
## 1 231
data |> filter(committerteam=="Red", repo=="Venus") |> group_by(trunc(COMPLEX/5)) |> tally()
## # A tibble: 32 × 2
##    `trunc(COMPLEX/5)`     n
##                 <dbl> <int>
##  1                  0    40
##  2                  1    18
##  3                  2    19
##  4                  3    26
##  5                  4    19
##  6                  5    16
##  7                  6     8
##  8                  7     2
##  9                  8     5
## 10                  9     7
## # ℹ 22 more rows
data |> filter(committerteam=="Red", repo=="Jupiter") |> tally()
##      n
## 1 2166
data |> filter(committerteam=="Red", repo=="Jupiter") |> group_by(trunc(COMPLEX/5)) |> tally()
## # A tibble: 89 × 2
##    `trunc(COMPLEX/5)`     n
##                 <dbl> <int>
##  1                  0   413
##  2                  1   301
##  3                  2   223
##  4                  3   179
##  5                  4   131
##  6                  5   138
##  7                  6    77
##  8                  7    67
##  9                  8    62
## 10                  9    33
## # ℹ 79 more rows
data |> filter(committerteam=="Red") |> group_by(repo) |> tally() |> arrange(desc(n))
## # A tibble: 8 × 2
##   repo        n
##   <fct>   <int>
## 1 Jupiter  2166
## 2 Saturn   1191
## 3 IntTest   617
## 4 Uranus    386
## 5 Venus     231
## 6 Neptune   215
## 7 Mercury   211
## 8 Mars      130
data |> filter(committerteam=="Blue", repo=="Venus") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 7 × 2
##   INTROD     n
##    <int> <int>
## 1      0   654
## 2      1     8
## 3      2     4
## 4      3     1
## 5      4     1
## 6      5     1
## 7      8     1
data |> filter(committerteam=="Blue", repo=="Uranus") |> group_by(INTROD) |> tally() |> arrange(INTROD)
## # A tibble: 7 × 2
##   INTROD     n
##    <int> <int>
## 1      0   968
## 2      1    44
## 3      2    23
## 4      3     5
## 5      4     4
## 6      5     2
## 7     17     1
data |> filter(COMPLEX==max(COMPLEX))
##      repo                                   commit file ISTEST ISNEW ISREMOVED
## 1 Jupiter 52efadf06bb61941887114298aa281c277e1f8a0  910  FALSE FALSE     FALSE
## 2 Jupiter 6de4ea0ed3207a898be965f3901efc5f0339bc0e  910  FALSE FALSE     FALSE
##                                     author authorteam
## 1 8ff0b36b3305550dc55c9097f3b970c599959326     Yellow
## 2 230d4e806215564296e126c9b532f1936adaaa89        Red
##                                  committer committerteam churn addedComplexity
## 1 0e33fff6868e21d94d344c4bf2fcd81285b2c3e2          Arch     3               0
## 2 230d4e806215564296e126c9b532f1936adaaa89           Red   113              10
##   removedComplexity cADD cDEL cSIZE cINTROD cREMOVED ADD DEL REASON CLOC
## 1                 0  582  879  1461       0        0   3   3      I 9443
## 2                 0  135   34   169       0        0 113  32      F 9443
##   COMPLEX DUP  logcADD  logcDEL logcSIZE logcREMOVED   logADD   logDEL
## 1    1244  25 6.368187 6.779922 7.287561           0 1.386294 1.386294
## 2    1244  25 4.912655 3.555348 5.135798           0 4.736198 3.496508
##   logCOMPLEX   logDUP INTROD REMOVED          A          R        C  scaleDUP
## 1   7.126891 3.258097      0       0 -0.4751701 -0.1458825 2.537155 0.5413155
## 2   7.126891 3.258097      0       0  1.6152839  1.2642792 2.537155 0.5413155
##          D         cA        cR        cSZ       cREM   cREMlog
## 1 2.058662 0.67129576  1.044691  0.6849683 -0.2146616 -0.455959
## 2 2.058662 0.03418981 -0.252292 -0.3619011 -0.2146616 -0.455959
teamBlue <- condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "Jupiter")
teamArch <- condeffect_logADD_by_logCOMPLEX(m, d, "Arch", "Jupiter")
plot_logADD_by_logCOMPLEX(m, d, teamBlue)

library(forcats)
reloo_plot_logADD_by_logCOMPLEX <- function(reloo, someData, ftot, aTeam, aRepo) {
  # Plot the conditional effect of added lines per complexity band (from ftot),
  # overlaid with the observed data points for one team/repo, where point size
  # shows the PSIS-LOO pareto_k diagnostic for each observation.
  #
  # reloo    - loo/reloo object; $diagnostics must align row-wise with someData
  # someData - model data (columns y, A, C, team, repo)
  # ftot     - conditional-effects predictions (columns A, C, Estimate, Q5.5, Q94.5)
  # aTeam    - committer team whose observations are shown
  # aRepo    - repository whose observations are shown
  observed <- bind_cols(someData, reloo$diagnostics) |>
    mutate(truncC = trunc(unscale_complexity(round(C))),
           added = unscale_added(A))
  # Order complexity bands numerically (factor() would otherwise sort them
  # lexically, e.g. "14" before "2").
  band_levels <- sort(unique(observed$truncC))
  observed <- observed |>
    mutate(truncC = factor(truncC, levels = band_levels),
           complexity = truncC) |>
    filter(team == aTeam, repo == aRepo)

  # The raw C values in ftot repeat, and factor() errors on duplicated levels,
  # so deduplicate and sort the band levels. Values and levels are both derived
  # from as.integer(C) so they are guaranteed to match.
  predicted <- ftot |>
    mutate(added = unscale_added(A),
           complexity = factor(
             trunc(unscale_complexity(as.integer(C))),
             levels = sort(unique(trunc(unscale_complexity(as.integer(C)))))))

  predicted |>
    ggplot(aes(x = added)) +
    geom_smooth(aes(y = Estimate, ymin = Q5.5, ymax = Q94.5,
                    group = complexity, color = complexity),
                stat = "identity", alpha = .25, linewidth = .5) +
    geom_point(data = observed, aes(y = y, size = pareto_k, color = truncC),
               alpha = 0.2) +
    ggtitle(paste0("Conditional effects of team ", aTeam, " in repo ", aRepo))
}
# NOTE: the call below is commented out because the predicted complexity
# levels contain a duplicate (factor level [6]), which factor() rejects.
#reloo_plot_logADD_by_logCOMPLEX(reloo, d, teamBlue, "Blue", "Jupiter")
teamBlue |> mutate(added=unscale_added(A), complexity=factor(trunc(unscale_complexity(as.integer(C))))) |> select(complexity) |> summary()
##  complexity
##  14 :6000  
##  489:4000
d |> sample_n(10) |> select(y, A, C, team, repo) |> mutate(foo = trunc(unscale_complexity(round(C))))
##    y           A          C   team    repo foo
## 1  2  1.46776930 -0.2997277 Orange Jupiter  14
## 2  0  1.34561803  2.0975739    Red IntTest 489
## 3  0  0.89597273  1.5101936 Violet IntTest 489
## 4  0 -0.04262272  1.2796068  Brown  Saturn  85
## 5  0 -0.33592097  0.3931244 Yellow   Venus  14
## 6  0 -0.65469353  0.3931244   Blue    Mars  14
## 7  0  1.07549612 -0.5330614   Blue  Saturn   2
## 8  0  0.42775652 -0.6379823  Brown  Saturn   2
## 9  0 -0.04262272  0.9503251   Blue  Uranus  85
## 10 1  1.00685182  0.3931244   Blue Jupiter  14
  #mutate(truncC=factor(trunc(unscale_complexity(round(C, 0)))), added=unscale_added(A), complexity=factor(trunc(unscale_complexity(round(C, 0))))) |> select(complexity) 
plot_logADD_by_logCOMPLEX(m, d, teamArch)

teamBrown <- condeffect_logADD_by_logCOMPLEX(m, d, "Brown", "IntTest")
plot_logADD_by_logCOMPLEX(m, d, teamBrown)

teamUI <- condeffect_logADD_by_logCOMPLEX(m, d, "UI", "Jupiter")
plot_logADD_by_logCOMPLEX(m, d, teamUI)

(p <- mcmc_areas_ridges(m, regex_pars = c("^b_")) + theme_bw() + ggtitle("Population-level beta parameter distributions"))

(p <- mcmc_areas(m, regex_pars = c("^b_")) + theme_bw() + ggtitle("Population-level beta parameter distributions"))

figsave("pop_level_betas.pdf", p)
mcmc_areas(m, regex_pars = c("^sd_"))

mcmc_areas(m, regex_pars = c("^r_team[[].*,Intercept"))

mcmc_areas(m, regex_pars = c("^r_team.*[[]Red,.*[]]"))

mcmc_areas(m, regex_pars = c("^r_team:repo.*[[].*Red_IntTest,.*[]]"))

plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Brown", "IntTest")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))

plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Blue", "IntTest", robust=T)) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))

plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Red", "IntTest")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))

plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Blue", "Neptune")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))

plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Blue", "Uranus")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))

plot_logCOMPLEX_by_logADD(m, d, condeffect_logCOMPLEX_by_logADD(m, d, "Red", "Jupiter")) + scale_x_continuous(trans="log1p", breaks=c(0,3,10,50,200, 1000))

plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "IntTest", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Red", "IntTest", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "Neptune", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

(p <- plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Green", "Jupiter", removed=q95(data$DEL), duplicates=q95(data$DUP))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw() + ylab("Estimated new duplicates") + xlab("Added lines (log scale)") + scale_y_continuous(limits=c(0,215))
)

figsave("condeffect_added_vs_complex_green_jupiter.pdf", p)
(p <- plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "Jupiter", removed=q95(data$DEL), duplicates=q95(data$DUP))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw() + ylab("Estimated new duplicates") + xlab("Added lines (log scale)") + scale_y_continuous(limits=c(0,250))
)

(p <- plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Red", "Jupiter", removed=q95(data$DEL), duplicates=q95(data$DUP))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw() + ylab("Estimated new duplicates") + xlab("Added lines (log scale)") + scale_y_continuous(limits=c(0,250))
)

figsave("condeffect_added_vs_complex_red_jupiter.pdf", p)
plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Blue", "Jupiter", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) #+ scale_y_continuous(limits=c(0,175))

plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Red", "Jupiter", removed=200, duplicates=20)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))+ scale_y_continuous(limits=c(0,175))

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Blue", "Neptune", removed=200, complexity=q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) #+ scale_y_continuous(limits=c(0,175))

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Red", "Neptune", removed=200, complexity=q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Blue", "Jupiter", removed=200, complexity=q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw()

jupiter_stats <- data |> filter(repo == "Jupiter") |> summarize(remQ95 = q95(DEL), compQ95 = q95(COMPLEX), remQ99 = q99(DEL), compQ99 = q99(COMPLEX))
plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Red", "Jupiter", removed=jupiter_stats$remQ95, complexity = jupiter_stats$compQ95)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw()

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Blue", "IntTest", removed=jupiter_stats$remQ95, complexity=jupiter_stats$compQ95)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw() + scale_y_continuous(limits = c(0,400))

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Red", "IntTest", removed=q95(data$DEL), complexity=q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw()

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Brown", "IntTest")) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000)) + theme_bw()

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Newbies", "IntTest")) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "NewTeam", "Saturn")) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Red", "Saturn", removed=0, duplicates = 0)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Green", "Saturn", removed=q95(data$DUP), duplicates = q95(data$DUP))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Orange", "Venus", removed=0, duplicates = 0)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Yellow", "Venus", removed=0, duplicates=0)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logCOMPLEX(m, d, condeffect_logADD_by_logCOMPLEX(m, d, "Green", "Venus", removed=0, duplicates=0)) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Brown", "Jupiter", removed=q95(data$DEL), complexity = q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

plot_logADD_by_logDUP(m, d, condeffect_logADD_by_logDUP(m, d, "Newbies", "Jupiter", removed=q95(data$DEL), complexity = q95(data$COMPLEX))) + scale_x_continuous(trans="log1p", breaks=c(0,10,100,1000, 4000))

source("predict_utils.R")

Q95 change in a highly complex file

(p <- plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam", "Green"), repo=c("Jupiter", "IntTest", "Aniara", "Venus"), added=q95(data$ADD), removed=q95(data$DEL), duplicates=q95(data$DUP), complex=q95(data$COMPLEX))) + scale_y_continuous(limits = c(0.25,1.0)) + scale_x_continuous(limits = c(0,30)) )
## Warning: Removed 4713 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_step()`).

duplicates_probability_for_team <- function(predictions, aTeam) {
  # Boxplot, per repository, of the predicted probability that a change by
  # aTeam introduces at least one duplicate (pred0 is given in percent).
  #
  # predictions - output of predict_for_team()/onepred(); must contain exactly
  #               one added/removed/complexity/duplicates scenario
  # aTeam       - team name to filter the predictions on
  params <- predictions |>
    select(added, removed, complexity, duplicates) |>
    distinct()
  # Guard: mixing several prediction scenarios in one plot would be misleading.
  stopifnot(nrow(params) == 1)
  predictions |>
    filter(team == aTeam) |>
    ggplot(aes(x = repo, y = pred0 / 100, color = team)) +
    geom_boxplot() +
    scale_color_manual(values = COLOR_BY_TEAM) +
    theme_bw() +
    ggtitle(paste("Probability of any duplicate per repository for team", aTeam),
            paste("added", round(params$added, 0),
                  "removed", round(params$removed, 0),
                  "complexity", round(params$complexity, 0),
                  "duplicates", round(params$duplicates, 0))) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
    ylab("P(INTROD > 0)") +
    scale_y_continuous(limits = c(0, 1))
}

more_five_probability_for_team <- function(predictions, aTeam) {
  # Boxplot, per repository, of the predicted probability that a change by
  # aTeam introduces more than five duplicates (pred5 is given in percent).
  #
  # predictions - output of predict_for_team()/onepred(); must contain exactly
  #               one added/removed/complexity/duplicates scenario
  # aTeam       - team name to filter the predictions on
  params <- predictions |>
    select(added, removed, complexity, duplicates) |>
    distinct()
  # Guard: mixing several prediction scenarios in one plot would be misleading.
  stopifnot(nrow(params) == 1)
  predictions |>
    filter(team == aTeam) |>
    ggplot(aes(x = repo, y = pred5 / 100, color = team)) +
    geom_boxplot() +
    scale_color_manual(values = COLOR_BY_TEAM) +
    theme_bw() +
    ggtitle(paste("Probability of more than five duplicates per repository for team", aTeam),
            paste("added", round(params$added, 0),
                  "removed", round(params$removed, 0),
                  "complexity", round(params$complexity, 0),
                  "duplicates", round(params$duplicates, 0))) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
    ylab("P(INTROD > 5)") +
    scale_y_continuous(limits = c(0, 1))
}
pred <- onepred(added=q95(data$ADD), removed=q95(data$DEL), complexity = q95(data$COMPLEX), duplicates=q95(data$DUP))
(p <- duplicates_probability_for_team(pred, "Blue"))

(p <- more_five_probability_for_team(pred, "Blue"))

Observed number of introduced duplicates.

(p <- data |> filter(repo == MARS, committerteam == GREEN) |> ggplot(aes(x=DUP, y=COMPLEX, size=INTROD, color=INTROD)) + geom_point() + geom_jitter() + scale_y_continuous(breaks=c(0,1,2,5, 10, 25, 50, 75, 100, 250, 500, 750), trans="log1p") + scale_x_continuous(breaks=c(0,1,2,5, 10, 25, 50, 75, 100), trans="log1p") + scale_color_distiller(palette = "Spectral") + theme_bw()
)

data |> filter(repo == MARS) |> group_by(committerteam) |> filter(INTROD == max(INTROD)) |> select(committerteam, INTROD, DUP) |> arrange(desc(INTROD))
## # A tibble: 28 × 3
## # Groups:   committerteam [10]
##    committerteam INTROD   DUP
##    <fct>          <int> <int>
##  1 Brown             24    86
##  2 Blue              14    72
##  3 Green             10     0
##  4 Violet             5     0
##  5 Yellow             2     1
##  6 Yellow             2     1
##  7 Red                2     0
##  8 Yellow             2     0
##  9 Orange             1     1
## 10 Orange             1     1
## # ℹ 18 more rows
(p <- data |> filter(repo == MARS, committerteam == GREEN) |> group_by(INTROD) |> ggplot(aes(x=DUP, group=INTROD, fill=INTROD)) + geom_bar(position="stack") + scale_x_continuous(breaks=c(0,1,2,5, 10, 25, 50, 75, 100), trans="log1p") + scale_fill_distiller(palette = "Spectral") + theme_bw()
)

plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam", "Green"), repo=c("Uranus", "Jupiter", "IntTest", "Mars"), added=q95(data$ADD), removed=q95(data$DEL), duplicates=q95(data$DUP), complex=q95(data$COMPLEX))) + scale_y_continuous(limits = c(0.50,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 3226 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_step()`).

Letting the teams work in new repo

allrepos <- c("Aniara", "Jupiter", "IntTest", "Mercury", "Venus", "Mars", "Saturn", "Neptune", "Uranus")

(p <- plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam", "Brown"), repo=allrepos, added=q95(data$ADD), removed=q95(data$DEL), duplicates=q95(data$DUP), complex=q95(data$COMPLEX))) + scale_y_continuous(limits = c(0.20,1.0)) + scale_x_continuous(limits = c(0,30)))
## Warning: Removed 10238 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_step()`).

(p <- plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam", "Green"), repo=c("Uranus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), duplicates=median(data$DUP), complex=median(data$COMPLEX))) + scale_y_continuous(limits = c(0.70,1.0)) + scale_x_continuous(limits = c(0,30)))
## Warning: Removed 41 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_step()`).

plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Red", "NewTeam"), repo=c("Uranus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), duplicates=20, complex=170)) + scale_y_continuous(limits = c(0.35,1)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 1225 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).

data |> summarize(q95(ADD), q95(DEL), q95(COMPLEX), q95(DUP))
##   q95(ADD) q95(DEL) q95(COMPLEX) q95(DUP)
## 1      143       92          282       36
data |> group_by(repo) |> summarize(q95(ADD), q95(DEL), q95(COMPLEX), q95(DUP))
## # A tibble: 8 × 5
##   repo    `q95(ADD)` `q95(DEL)` `q95(COMPLEX)` `q95(DUP)`
##   <fct>        <dbl>      <dbl>          <dbl>      <dbl>
## 1 IntTest       216       127.           558          110
## 2 Jupiter       120        74.4          177           19
## 3 Mars          146.      114            154.           5
## 4 Mercury       144       106            282            6
## 5 Neptune       231       156            180            9
## 6 Saturn        101.       72.6          320            9
## 7 Uranus        158.       95.9           89.9          4
## 8 Venus         154        84.5          132            4
data |> group_by(repo) |> summarize(q99(ADD), q99(DEL), q99(COMPLEX), q99(DUP))
## # A tibble: 8 × 5
##   repo    `q99(ADD)` `q99(DEL)` `q99(COMPLEX)` `q99(DUP)`
##   <fct>        <dbl>      <dbl>          <dbl>      <dbl>
## 1 IntTest       638.       511.           633       620. 
## 2 Jupiter       298.       246.          1056        32  
## 3 Mars          316.       557.           661        72  
## 4 Mercury       304.       283.           374.       16.9
## 5 Neptune       511        474            216        28  
## 6 Saturn        239        209.           758        18  
## 7 Uranus        291.       273.           126.        9  
## 8 Venus         433        374.           260        11

Strange that the teams in the Neptune repo seem very similar for this change? The Neptune repo is mostly static, with no team owning it — mostly abandoned code.

plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Blue", "Red", "NewTeam", "Orange"), repo=c("Uranus", "Jupiter", "Venus"), added=q95(data$ADD), removed=q95(data$DEL))) + scale_y_continuous(limits = c(0.6,1)) + scale_x_continuous(limits = c(0,20))
## Warning: Removed 359 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_step()`).

plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Blue", "Red", "NewTeam", "Orange"), repo=c("Uranus", "Jupiter", "IntTest"), added=mean(data$ADD), removed=mean(data$DEL))) + scale_y_continuous(limits = c(0.2,1.02)) + scale_x_continuous(limits = c(0,20))
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).

plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Blue", "Red", "Arch", "NewTeam"), repo=c("Uranus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), complexity = mean(data$COMPLEX), duplicates=mean(data$DUP))) + scale_y_continuous(limits = c(0.2,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 328 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).

plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Blue", "Red", "Arch", "NewTeam"), repo=c("Uranus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), complexity = q95(data$COMPLEX), duplicates=q95(data$DUP))) + scale_y_continuous(limits = c(0.3,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 2114 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).

plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Green", "Yellow", "Orange", "NewTeam"), repo=c("Venus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), complexity = q95(data$COMPLEX), duplicates=q95(data$DUP))) + scale_y_continuous(limits = c(0.5,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 2403 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).

plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Brown","Red", "Green", "Yellow", "Orange", "NewTeam"), repo=c("Venus", "Jupiter", "IntTest"), added=q95(data$ADD), removed=q95(data$DEL), complexity = q95(data$COMPLEX), duplicates=q95(data$DUP))) + scale_y_continuous(limits = c(0.2,1.0)) + scale_x_continuous(limits = c(0,30))
## Warning: Removed 6318 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_step()`).

plot_cumulative_prob_of_duplicates(predict_for_team(m, c("Arch", "Blue", "Brown","Red", "Green", "Yellow", "Orange", "NewTeam"), repo=c("Venus", "Jupiter", "IntTest"), added=q99(data$ADD), removed=q99(data$DEL), complexity = median(data$COMPLEX), duplicates=median(data$DUP))) + scale_y_continuous(limits = c(0.5,1.0)) + scale_x_continuous(limits = c(0,20))
## Warning: Removed 2651 rows containing non-finite outside the scale range
## (`stat_ecdf()`).
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_step()`).